{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## indexers\n",
    "Basic indexing and searching adapted from [lucene's documentation](https://lucene.apache.org/core/8_7_0/core/index.html).\n",
    "\n",
    "### lucene"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "import lucene\n",
    "from org.apache.lucene import analysis, document, index, queryparser, search, store\n",
    "from lupyne import engine\n",
    "\n",
    "assert lucene.getVMEnv() or lucene.initVM()\n",
    "    \n",
    "analyzer = analysis.standard.StandardAnalyzer()\n",
    "\n",
    "directory = store.RAMDirectory()\n",
    "config = index.IndexWriterConfig(analyzer)\n",
    "iwriter = index.IndexWriter(directory, config)\n",
    "doc = document.Document()\n",
    "text = \"This is the text to be indexed.\"\n",
    "doc.add(document.Field('fieldname', text, document.TextField.TYPE_STORED))\n",
    "iwriter.addDocument(doc)\n",
    "iwriter.close()\n",
    "\n",
    "# Now search the index:\n",
    "ireader = index.DirectoryReader.open(directory)\n",
    "isearcher = search.IndexSearcher(ireader)\n",
    "# Parse a simple query that searches for \"text\":\n",
    "parser = queryparser.classic.QueryParser('fieldname', analyzer)\n",
    "query = parser.parse('text')\n",
    "hits = isearcher.search(query, 10).scoreDocs\n",
    "assert len(hits) == 1\n",
    "# Iterate through the results:\n",
    "for hit in hits:\n",
    "    hitDoc = isearcher.doc(hit.doc)\n",
    "    assert hitDoc['fieldname'] == text\n",
    "ireader.close()\n",
    "directory.close()"
   ],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### lupyne"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "indexer = engine.Indexer()  # Indexer combines Writer and Searcher; RAMDirectory and StandardAnalyzer are defaults\n",
    "indexer.set('fieldname', engine.Field.Text, stored=True)  # default indexed text settings for documents\n",
    "indexer.add(fieldname=text)  # add document\n",
    "indexer.commit()  # commit changes and refresh searcher\n",
    "\n",
    "hits = indexer.search('text', field='fieldname')  # parsing handled if necessary\n",
    "assert len(hits) == 1\n",
    "for hit in hits:  # hits support mapping interface\n",
    "    assert hit['fieldname'] == text\n",
    "# closing is handled automatically"
   ],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## queries\n",
    "Classmethods for convenient query building. Operator overloading is used for combining boolean clauses, provided at least one of the queries is wrapped by lupyne.\n",
    "\n",
    "### lucene"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "from org.apache.lucene.search import spans\n",
    "\n",
    "q1 = search.TermQuery(index.Term('text', 'lucene'))\n",
    "q2 = search.PhraseQuery.Builder() \\\n",
    "    .add(index.Term('text', 'search')) \\\n",
    "    .add(index.Term('text', 'engine')) \\\n",
    "    .build()\n",
    "search.BooleanQuery.Builder() \\\n",
    "    .add(q1, search.BooleanClause.Occur.MUST) \\\n",
    "    .add(q2, search.BooleanClause.Occur.MUST) \\\n",
    "    .build()"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "q1 = spans.SpanTermQuery(index.Term('text', 'hello'))\n",
    "q2 = spans.SpanTermQuery(index.Term('text', 'world'))\n",
    "q3 = spans.SpanPositionRangeQuery(q1, 0, 10)\n",
    "q4 = spans.SpanNearQuery([q1, q2], 0, True)\n",
    "spans.SpanNotQuery(q3, q4)"
   ],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### lupyne"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "Q = engine.Query\n",
    "\n",
    "Q.term('text', 'lucene') & Q.phrase('text', 'search', 'engine')"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "Q.span('text', 'hello')[:10] - Q.near('text', 'hello', 'world')"
   ],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## searching\n",
    "Advanced searching with custom fields.\n",
    "\n",
    "Lupyne SpatialFields and DateTimeFields are implemented as lucene Point fields.\n",
    "NestedFields simulate a composite index.\n",
    "The fields have convenience methods for creating prefix and range queries."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import date\n",
    "\n",
    "docs = [\n",
    "    {'city': 'San Francisco', 'state': 'CA', 'incorporated': '1850-04-15', 'population': 808976, 'longitude': -122.4192, 'latitude': 37.7752},\n",
    "    {'city': 'Los Angeles', 'state': 'CA', 'incorporated': '1850-04-04', 'population': 3849378, 'longitude': -118.2434, 'latitude': 34.0521},\n",
    "    {'city': 'Portland', 'state': 'OR', 'incorporated': '1851-02-08', 'population': 575930, 'longitude': -122.6703, 'latitude': 45.5238},\n",
    "]\n",
    "\n",
    "indexer = engine.Indexer()\n",
    "indexer.set('city', stored=True)\n",
    "indexer.set('state', stored=True)\n",
    "# set method supports custom field types inheriting their default settings\n",
    "indexer.set('incorporated', engine.DateTimeField)\n",
    "indexer.set('year-month-day', engine.NestedField, sep='-')\n",
    "indexer.set('population', dimensions=1)\n",
    "indexer.set('point', engine.SpatialField)\n",
    "# assigned fields can have a different key from their underlying field name\n",
    "indexer.fields['location'] = engine.NestedField('state.city')\n",
    "\n",
    "for doc in docs:\n",
    "    doc['year-month-day'] = doc['incorporated']\n",
    "    point = doc.pop('longitude'), doc.pop('latitude')\n",
    "    location = doc['state'] + '.' + doc['city']\n",
    "    incorporated = map(int, doc.pop('incorporated').split('-'))\n",
    "    indexer.add(doc, location=location, incorporated=date(*incorporated), point=[point])\n",
    "indexer.commit()\n",
    "\n",
    "query = indexer.fields['incorporated'].prefix([1850])\n",
    "[hit['city'] for hit in indexer.search(query)]"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = indexer.fields['incorporated'].range(date(1850, 4, 10), None)\n",
    "[hit['city'] for hit in indexer.search(query)]"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = indexer.fields['year-month-day'].prefix('1850')\n",
    "query"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "[hit['city'] for hit in indexer.search(query)]"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = indexer.fields['year-month-day'].range('1850-04-10', None)\n",
    "query"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "[hit['city'] for hit in indexer.search(query)]"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = Q.ranges('population', (0, 1000000))\n",
    "[hit['city'] for hit in indexer.search(query)]"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "cities = ['San Francisco', 'Los Angeles', 'Portland']\n",
    "for index, distance in enumerate([1e3, 1e5, 7e5, 1e6]):\n",
    "    query = indexer.fields['point'].within(-122.4, 37.7, distance=distance)\n",
    "    print([hit['city'] for hit in indexer.search(query)])"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = indexer.fields['location'].prefix('CA.San')\n",
    "query  # works like any prefix query"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "[hit['city'] for hit in indexer.search(query)]"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = indexer.fields['location'].prefix('CA')\n",
    "query  # optimized to search the best field"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "[hit['city'] for hit in indexer.search(query)]"
   ],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## sorting\n",
    "PyLucene has several pitfalls when collecting or sorting a large query result.\n",
    "Generally they involve the overhead of traversing the VM in an internal loop.\n",
    "\n",
    "Lucene also requires supplying a maximum doc count for searches,\n",
    "and supplying an excessively large count is a poor workaround because the collection heap is pre-allocated.\n",
    "\n",
    "To mitigate these problems, Lupyne first provides a unified search interface.\n",
    "The same Hits type is returned regardless of optional doc count or sorting parameters.\n",
    "As with lucene, the result is fully evaluated but each individual Hit object will only be loaded on demand.\n",
    "Internally a CachingCollector is used when all docs are requested.\n",
    "\n",
    "The search method allows lucene Sort parameters to be passed through, since that's still optimal.\n",
    "Additionally the hits themselves can be sorted afterwards with any python callable key.\n",
    "The IndexReader.docvalues method is convenient for creating a sort key table from fields with docvalues.\n",
    "The upshot is custom sorting and sorting large results are both easier and faster.\n",
    "\n",
    "Custom sorting isn't necessary in the below example of course, just there for demonstration.\n",
    "\n",
    "### lucene"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "colors = 'red', 'green', 'blue', 'cyan', 'magenta', 'yellow'\n",
    "indexer = engine.Indexer()\n",
    "indexer.set('color', engine.Field.String, stored=True, docValuesType='sorted')\n",
    "for color in colors:\n",
    "    indexer.add(color=color)\n",
    "indexer.commit()\n",
    "\n",
    "searcher = search.IndexSearcher(indexer.indexReader)\n",
    "sorter = search.Sort(search.SortField('color', search.SortField.Type.STRING))\n",
    "topdocs = searcher.search(search.MatchAllDocsQuery(), 10, sorter)\n",
    "[searcher.doc(scoredoc.doc)['color'] for scoredoc in topdocs.scoreDocs]"
   ],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### lupyne"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "hits = indexer.search(sort='color')\n",
    "[hit['color'] for hit in hits]"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "docvalues = hits.docvalues('color')\n",
    "docvalues"
   ],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "hits = indexer.search().sorted(docvalues.__getitem__)\n",
    "[hit['color'] for hit in hits]"
   ],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## grouping\n",
    "Lupyne supports lucene's contrib grouping.GroupingSearch interface, but it has some limitations.\n",
    "GroupingSearch objects only support single-valued strings, and won't find zero-valued facets.\n",
    "Lupyne also supports grouping hits by an arbitrary function after the original search.\n",
    "Similar to sorting, the native approach is generally more efficient, proportional to the number of documents culled.\n",
    "\n",
    "Lupyne can also compute facet counts with intersected queries.\n",
    "Although seemingly less efficient, it may be faster with small numbers of terms.\n",
    "It also has no limitations on multiple values, and can be fully customized without reindexing."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "\n",
    "colors = 'red', 'green', 'blue', 'cyan', 'magenta', 'yellow'\n",
    "facets = dict(zip(colors, itertools.count(1)))\n",
    "indexer = engine.Indexer()\n",
    "indexer.set('color', engine.Field.String, stored=True, docValuesType='sorted')\n",
    "for color in facets:\n",
    "    for _ in range(facets[color]):\n",
    "        indexer.add(color=color)\n",
    "indexer.commit()\n",
    "query = Q.alldocs()"
   ],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Groupby using GroupingSearch."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "for hits in indexer.groupby('color', query):\n",
    "    assert facets[hits.value] == hits.count\n",
    "    (hit,) = hits\n",
    "    assert hit['color'] == hits.value"
   ],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Groupby using Hits."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "hits = indexer.search(query)\n",
    "for hits in hits.groupby(hits.docvalues('color').__getitem__, docs=1):\n",
    "    assert facets[hits.value] == hits.count\n",
    "    (hit,) = hits\n",
    "    assert hit['color'] == hits.value"
   ],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Facets using GroupingSearch."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "indexer.facets(query, 'color')"
   ],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Facets using query counts."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "queries = {'additive': Q.any(color=colors[:3]), 'subtractive': Q.any(color=colors[3:])}\n",
    "indexer.facets(query, color=queries)"
   ],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}